#!/usr/bin/env python3

"""Refresh and update language stemming data from the Snowball project."""

# /// script
# requires-python = ">=3.11"
# dependencies = [
#     "requests>=2.30",
# ]
# ///

from __future__ import annotations

import hashlib
import shutil
import subprocess
import sys
import tarfile
import tempfile
from io import BytesIO
from pathlib import Path

import requests

# Pinned Snowball release: version, source tarball URL, and the expected
# SHA-256 of that tarball (verified before extraction).
SNOWBALL_VERSION = '3.0.1'
SNOWBALL_URL = f'https://github.com/snowballstem/snowball/archive/refs/tags/v{SNOWBALL_VERSION}.tar.gz'
SNOWBALL_SHA256 = '80ac10ce40dc4fcfbfed8d085c457b5613da0e86a73611a3d5527d044a142d60'

# Repository layout: this script lives in a subdirectory of the project root
# (two levels up from this file), with outputs under sphinx/search/.
ROOT = Path(__file__).resolve().parent.parent
SEARCH_DIR = ROOT / 'sphinx' / 'search'
STOPWORDS_DIR = SEARCH_DIR / '_stopwords'
NON_MINIFIED_JS_DIR = SEARCH_DIR / 'non-minified-js'

# (language code, language name, upstream stop-word list URL) triples.
# Entries with no URL are commented out — presumably no published stop-word
# list exists upstream for those languages (TODO confirm).
STOPWORD_URLS = (
    ('da', 'danish', 'https://snowballstem.org/algorithms/danish/stop.txt'),
    ('de', 'german', 'https://snowballstem.org/algorithms/german/stop.txt'),
    ('en', 'english', 'https://snowballstem.org/algorithms/english/stop.txt'),
    ('es', 'spanish', 'https://snowballstem.org/algorithms/spanish/stop.txt'),
    ('fi', 'finnish', 'https://snowballstem.org/algorithms/finnish/stop.txt'),
    ('fr', 'french', 'https://snowballstem.org/algorithms/french/stop.txt'),
    ('hu', 'hungarian', 'https://snowballstem.org/algorithms/hungarian/stop.txt'),
    ('it', 'italian', 'https://snowballstem.org/algorithms/italian/stop.txt'),
    ('nl', 'dutch', 'https://snowballstem.org/algorithms/dutch/stop.txt'),
    ('no', 'norwegian', 'https://snowballstem.org/algorithms/norwegian/stop.txt'),
    ('pt', 'portuguese', 'https://snowballstem.org/algorithms/portuguese/stop.txt'),
    # ('ro', 'romanian', ''),
    ('ru', 'russian', 'https://snowballstem.org/algorithms/russian/stop.txt'),
    ('sv', 'swedish', 'https://snowballstem.org/algorithms/swedish/stop.txt'),
    # ('tr', 'turkish', ''),
)


def regenerate_stopwords() -> None:
    """Download the Snowball stop-word lists and regenerate the Python modules.

    For each entry in ``STOPWORD_URLS``, fetch the upstream ``stop.txt``,
    store a normalised UTF-8 copy as ``<lang>.txt``, and write a generated
    ``<lang>.py`` module exposing a ``<LANGUAGE>_STOPWORDS`` frozenset.

    Raises ``requests.HTTPError`` if any download fails.
    """
    STOPWORDS_DIR.mkdir(parents=True, exist_ok=True)
    STOPWORDS_DIR.joinpath('__init__.py').touch()

    for lang_code, lang_name, url in STOPWORD_URLS:
        response = requests.get(url, timeout=5)
        # Fail loudly on HTTP errors; otherwise an error page (e.g. 404 HTML)
        # would be silently written into the stopword files.
        response.raise_for_status()
        content = response.content
        try:
            data = content.decode('utf-8')
        except UnicodeDecodeError:
            # The upstream lists are not all UTF-8: the Russian list is
            # KOI8-R; the others fall back to Latin-1.
            if lang_code == 'ru':
                data = content.decode('koi8-r')
            else:
                data = content.decode('latin-1')

        # record the original source of the stopwords list
        txt_path = STOPWORDS_DIR / f'{lang_code}.txt'
        txt_path.write_text(data.rstrip() + '\n', encoding='utf-8')

        # generate the Python stopwords set, sorted case-insensitively for a
        # stable, reviewable diff
        stopwords = parse_stop_word(data)
        with (STOPWORDS_DIR / f'{lang_code}.py').open('w', encoding='utf-8') as f:
            f.write('# automatically generated by utils/generate-snowball.py\n')
            f.write(f'# from {url}\n\n')
            f.write('from __future__ import annotations\n\n')
            f.write(f'{lang_name.upper()}_STOPWORDS = frozenset(')
            if stopwords:
                f.write('{\n')
                for word in sorted(stopwords, key=str.casefold):
                    f.write(f'    {word!r},\n')
                f.write('}')
            f.write(')\n')


def parse_stop_word(source: str) -> frozenset[str]:
    """Extract the stopwords from a Snowball-style word list.

    Each line holds whitespace-separated stop words, optionally followed by
    a ``|``-introduced comment which is ignored:

    .. code:: text

        list of space separated stop words | optional comment
    """
    return frozenset(
        word
        for raw_line in source.splitlines()
        for word in raw_line.partition('|')[0].split()
    )


def regenerate_javascript() -> None:
    """Build the Snowball JavaScript stemmers and copy them into sphinx/search.

    Downloads the pinned Snowball release, verifies its SHA-256 checksum,
    builds the JavaScript stemmer distribution with ``make`` (must be on
    PATH), and copies the generated files into ``NON_MINIFIED_JS_DIR``.

    Raises ``requests.HTTPError`` on download failure, ``RuntimeError`` on a
    checksum mismatch, and ``subprocess.CalledProcessError`` if the build
    fails.
    """
    tmp_root = Path(tempfile.mkdtemp())
    try:
        # Download and verify the snowball release
        response = requests.get(SNOWBALL_URL, timeout=60)
        response.raise_for_status()
        archive = response.content
        digest = hashlib.sha256(archive).hexdigest()
        if digest != SNOWBALL_SHA256:
            msg = (
                f'data does not match expected checksum '
                f'(expected {SNOWBALL_SHA256}, saw {digest}).'
            )
            raise RuntimeError(msg)

        # Extract the release archive ('data' filter guards against
        # path-traversal entries)
        with tarfile.open(fileobj=BytesIO(archive), mode='r:gz') as tar:
            tar.extractall(tmp_root, filter='data')
        snowball_root = tmp_root / f'snowball-{SNOWBALL_VERSION}'
        snowball_dist = snowball_root / 'dist'

        # Generate JS stemmer files
        cmd = ('make', '--jobs=8', 'dist_libstemmer_js')
        subprocess.run(cmd, check=True, cwd=snowball_root)
        with tarfile.open(
            snowball_dist / f'jsstemmer-{SNOWBALL_VERSION}.tar.gz'
        ) as tar:
            tar.extractall(snowball_dist, filter='data')

        # Copy generated JS to sphinx/search/
        NON_MINIFIED_JS_DIR.mkdir(exist_ok=True)
        js_dir = snowball_dist / f'jsstemmer-{SNOWBALL_VERSION}' / 'javascript'
        shutil.copytree(js_dir, NON_MINIFIED_JS_DIR, dirs_exist_ok=True)
    finally:
        # Remove the whole temporary tree, even on failure.  The original
        # removed only snowball_root, leaking the mkdtemp() directory itself
        # (and everything, on any earlier error).
        shutil.rmtree(tmp_root, ignore_errors=True)


if __name__ == '__main__':
    regenerate_stopwords()
    # JS regeneration is skipped on Windows — presumably because it shells
    # out to `make`, which is typically unavailable there (TODO confirm).
    if sys.platform != 'win32':
        regenerate_javascript()
